
# Start from an empty workspace: removes every object, including
# dot-prefixed ones, from the environment the script is run in.
rm(list=ls(all=TRUE))

# dplyr / tidyr: reshaping and grouped summaries.
# survival / survminer: survfit() and ggsurvplot() used in the Fig 2 section.
library(dplyr)
library(tidyr)
library(survival)
library(survminer)

# Placeholder: replace this string with the folder that holds the input .csv
# files. All read.csv() calls below use file names relative to this folder.
wkdir <- 'working folder where the .csv files are saved'
setwd(wkdir)

######################################################## dat.GE107:
# Builds mean.GE107: one row per study/control gene combination listed in
# genemix.tpl, one column per sample, holding the mean of
# (control Cq - study Cq) over the wells of that combination.
raw107 <- read.csv('_raw gene expression of 107 samples.csv', stringsAsFactors = FALSE)

# Columns 2..31 of the raw file are read as 15 consecutive (gene label, Cq)
# column pairs. Stack them, pair by pair, into one long table. Columns are
# picked by position with base indexing (passing a bare external index to
# dplyr::select() is deprecated) and the pieces are bound once instead of
# growing `tmp` inside a loop.
tmp <- do.call(rbind, lapply(1:15, function(i) {
  data.frame(Sample.ID = raw107[['Sample.ID']],
             Gene = raw107[[2 * i]],
             Cq = raw107[[2 * i + 1]],
             stringsAsFactors = FALSE)
}))

# Normalise the labels: drop embedded spaces and fold to upper case. The
# membership checks below were already case-insensitive; folding the stored
# label as well keeps the combined key (Gene.mix) comparable with the
# upper-case names in genemix.tpl, so a lower-case label is no longer
# silently dropped by the match() further down.
tmp$Gene <- toupper(gsub(' ', '', tmp$Gene))

ctl.genes <- c('C1', 'C2', 'C3')
stu.genes <- c('PR1', 'PR2', 'PR3', 'PC1', 'PC2')

# Rows are consumed in blocks of six: rows j..j+2 are checked against the
# control labels and rows j+3..j+5 against the study labels; row j+k is paired
# with row j+k+3. The sample id is taken from the first row of the block.
# NOTE(review): this assumes every block of six rows belongs to one sample
# and is ordered controls-then-study -- confirm against the raw file layout.
pair.list <- lapply(seq(1, nrow(tmp), 6), function(j) {
  ctl <- j + 0:2
  stu <- ctl + 3
  # Indices past the end of `tmp` yield NA labels, which fail %in%.
  ok <- tmp$Gene[ctl] %in% ctl.genes & tmp$Gene[stu] %in% stu.genes
  if (!any(ok)) return(NULL)  # nothing usable in this block; skip it
  data.frame(Sample.ID = tmp$Sample.ID[j],
             Gene.mix = paste0(tmp$Gene[stu[ok]], tmp$Gene[ctl[ok]]),
             Cq.StuGene = tmp$Cq[stu[ok]],
             Cq.CtlGene = tmp$Cq[ctl[ok]],
             stringsAsFactors = FALSE)
})

dat.GE107 <- do.call(rbind, pair.list)  # dataset of gene expression (GE) for 107 samples
if (is.null(dat.GE107)) stop('no valid control/study gene pairs found in the 107-sample file')

# Normalised expression: control Cq minus study Cq.
dat.GE107$negdt <- dat.GE107$Cq.CtlGene - dat.GE107$Cq.StuGene

# Mean over the wells of each sample / gene combination.
summ.GE107 <-
  dat.GE107 %>%
  group_by(Sample.ID, Gene.mix) %>%
  summarise(mean.negdt = mean(negdt)) %>%
  ungroup() %>%
  as.data.frame(stringsAsFactors = FALSE)

# Fixed row order of the output table.
genemix.tpl <- c("PC1C1", "PC1C2", "PC1C3",
                 "PC2C1", "PC2C2", "PC2C3",
                 "PR1C1", "PR1C2", "PR1C3",
                 "PR2C1", "PR2C2", "PR2C3",
                 "PR3C1", "PR3C2", "PR3C3")

# One column per sample, rows aligned to genemix.tpl; a combination that is
# absent for a sample becomes NA. Built as a list of columns so that a
# single-sample input also yields a data frame.
ids.GE107 <- unique(summ.GE107$Sample.ID)
cols.GE107 <- lapply(ids.GE107, function(s) {
  dat.s <- summ.GE107[summ.GE107$Sample.ID == s, ]
  dat.s$mean.negdt[match(genemix.tpl, dat.s$Gene.mix)]
})
names(cols.GE107) <- as.character(ids.GE107)

mean.GE107 <- data.frame(cols.GE107, check.names = FALSE, stringsAsFactors = FALSE)
rownames(mean.GE107) <- genemix.tpl


######################################################## dat.GE24
# Builds mean.GE24 for the 24 normal samples: one row per study/control gene
# combination listed in genemix.tpl, one column per sample, holding the mean
# of (control Cq - study Cq) over the wells of that combination.
raw24 <- read.csv('_raw gene expression of 24 normal samples.csv', stringsAsFactors = FALSE)

# Columns 2..31 of the raw file are read as 15 consecutive (gene label, Cq)
# column pairs. Stack them, pair by pair, into one long table. Columns are
# picked by position with base indexing (passing a bare external index to
# dplyr::select() is deprecated) and the pieces are bound once instead of
# growing `tmp` inside a loop.
tmp <- do.call(rbind, lapply(1:15, function(i) {
  data.frame(Sample.ID = raw24[['Sample.ID']],
             Gene = raw24[[2 * i]],
             Cq = raw24[[2 * i + 1]],
             stringsAsFactors = FALSE)
}))

# Normalise the labels: drop embedded spaces and fold to upper case. The
# membership checks below were already case-insensitive; folding the stored
# label as well keeps the combined key (Gene.mix) comparable with the
# upper-case names in genemix.tpl, so a lower-case label is no longer
# silently dropped by the match() further down.
tmp$Gene <- toupper(gsub(' ', '', tmp$Gene))

ctl.genes <- c('C1', 'C2', 'C3')
stu.genes <- c('PR1', 'PR2', 'PR3', 'PC1', 'PC2')

# Rows are consumed in blocks of six: rows j..j+2 are checked against the
# control labels and rows j+3..j+5 against the study labels; row j+k is paired
# with row j+k+3. The sample id is taken from the first row of the block.
# NOTE(review): this assumes every block of six rows belongs to one sample
# and is ordered controls-then-study -- confirm against the raw file layout.
pair.list <- lapply(seq(1, nrow(tmp), 6), function(j) {
  ctl <- j + 0:2
  stu <- ctl + 3
  # Indices past the end of `tmp` yield NA labels, which fail %in%.
  ok <- tmp$Gene[ctl] %in% ctl.genes & tmp$Gene[stu] %in% stu.genes
  if (!any(ok)) return(NULL)  # nothing usable in this block; skip it
  data.frame(Sample.ID = tmp$Sample.ID[j],
             Gene.mix = paste0(tmp$Gene[stu[ok]], tmp$Gene[ctl[ok]]),
             Cq.StuGene = tmp$Cq[stu[ok]],
             Cq.CtlGene = tmp$Cq[ctl[ok]],
             stringsAsFactors = FALSE)
})

dat.GE24 <- do.call(rbind, pair.list)
if (is.null(dat.GE24)) stop('no valid control/study gene pairs found in the 24-sample file')

# Normalised expression: control Cq minus study Cq.
dat.GE24$negdt <- dat.GE24$Cq.CtlGene - dat.GE24$Cq.StuGene

# Mean over the wells of each sample / gene combination.
summ.GE24 <-
  dat.GE24 %>%
  group_by(Sample.ID, Gene.mix) %>%
  summarise(mean.negdt = mean(negdt)) %>%
  ungroup() %>%
  as.data.frame(stringsAsFactors = FALSE)

# One column per sample, rows aligned to genemix.tpl (defined in the
# dat.GE107 section); a combination that is absent for a sample becomes NA.
# Built as a list of columns so that a single-sample input also yields a
# data frame.
ids.GE24 <- unique(summ.GE24$Sample.ID)
cols.GE24 <- lapply(ids.GE24, function(s) {
  dat.s <- summ.GE24[summ.GE24$Sample.ID == s, ]
  dat.s$mean.negdt[match(genemix.tpl, dat.s$Gene.mix)]
})
names(cols.GE24) <- as.character(ids.GE24)

mean.GE24 <- data.frame(cols.GE24, check.names = FALSE, stringsAsFactors = FALSE)
rownames(mean.GE24) <- genemix.tpl


######################################################## dat.clinc
# Clinical table; text columns are kept as character rather than factor.
dat.clinc <- read.csv(file = '_clinic information of 107 samples.csv',
                      stringsAsFactors = FALSE)


######################################################## Table 1
# Round half away from zero to `n` decimal places.
#
# Base round() can send a trailing 5 either way (it follows the IEC 60559
# "round half to even" rule and is subject to binary representation error),
# e.g. compare:
#   x = c(1.85, 1.54, 1.65, 1.85, 1.84)
#   round(x, 1)
#
# Arguments:
#   x  numeric vector to round.
#   n  number of decimal places to keep (default 0).
# Returns a numeric vector the same length as x; NA stays NA.
round2 = function(x, n = 0) {
  posneg = sign(x)
  z = abs(x)*10^n
  # A decimal that ends in 5 is often stored slightly below its printed value
  # (1.005*100 is 100.49999999999999), so adding a bare 0.5 would still round
  # it down. The small tolerance pushes such values over the threshold.
  z = z + 0.5 + sqrt(.Machine$double.eps)
  z = trunc(z)
  z = z/10^n
  z*posneg
}

# Numeric variables: range, number of non-missing values and mean(sd).
# Missing values are removed explicitly before summarising. The earlier
# version grouped on !is.na(value) and then deleted the second output row by
# position, which only removed the intended row for one particular pattern of
# missing data (and removed a real row when nothing was missing).
tab1.num <-
  gather(dat.clinc[, colnames(dat.clinc) %in% c('AGE', 'YearsDiff')], 'var', 'value') %>%
  dplyr::filter(!is.na(value)) %>%
  group_by(var) %>%
  summarise(val = paste(min(value), max(value), sep = ' - '),
            n = length(value),
            m.std = paste(round(mean(value), 2),
                          "(",
                          round(sd(value), 2),
                          ")",
                          sep = ''))
tab1.num <- data.frame(tab1.num)

# Categorical variables: count of each level per variable.
tab1.cate <-
  gather(dat.clinc[, colnames(dat.clinc) %in%
                     c('Gleason.Grade', 'Path.Stage', 'RECUR_YesNo')], 'var', 'value') %>%
  count(var, value)

# Percentage of all rows of dat.clinc, to one decimal place.
tab1.cate$prop <- round2(100 * tab1.cate$n / nrow(dat.clinc), 1)

######################################################## Figure 1
GE_summ <- function(mean.GE) # per-sample gene expression summary
{
  # mean.GE: table with one column per sample; row names contain the assay
  #          labels PR1, PR2, PR3, PC1, PC2.
  # Returns a data frame with rows PR_med, PC_med, PR_mean, PC_mean and one
  # column per sample.

  # Apply `stat` to every column of `block`.
  col.stat <- function(block, stat) vapply(block, stat, numeric(1))

  assays <- c('PR1', 'PR2', 'PR3', 'PC1', 'PC2')

  # Step 1: per assay, the median over all rows whose name contains its label.
  by.assay <- lapply(assays, function(a) {
    col.stat(mean.GE[grep(a, rownames(mean.GE)), ], median)
  })
  names(by.assay) <- assays
  dat.assay <- as.data.frame(do.call(rbind, by.assay))

  # Step 2: combine the assay medians per gene (rows 1-3 are PR, 4-5 are PC),
  # once with the median and once with the mean.
  dat.PR <- dat.assay[1:3, ]
  dat.PC <- dat.assay[4:5, ]

  as.data.frame(rbind(PR_med = col.stat(dat.PR, median),
                      PC_med = col.stat(dat.PC, median),
                      PR_mean = col.stat(dat.PR, mean),
                      PC_mean = col.stat(dat.PC, mean)))
}

# Per-sample summaries for the tumor (107) and normal (24) tables.
dat.tumor <- GE_summ(mean.GE107)
dat.normal <- GE_summ(mean.GE24)

## Tumor columns whose clinical row has a non-empty Sample.ID.24
## (presumably the tumors with a matched normal sample -- confirm).
ids.with.normal <- dat.clinc$Sample.ID.107[dat.clinc$Sample.ID.24 != '']
dat.tumor24 <- dat.tumor[, colnames(dat.tumor) %in% ids.with.normal]

# Wide -> long: one row per (summary row, sample).
tmp <- data.frame(gene = rownames(dat.tumor24), dat.tumor24,
                  stringsAsFactors = FALSE)
rownames(tmp) <- NULL
dat.tumor24.long <- gather(tmp, spl, expression, -gene)

## Same reshaping for the normal samples.
tmp <- data.frame(gene = rownames(dat.normal), dat.normal,
                  stringsAsFactors = FALSE)
rownames(tmp) <- NULL
dat.normal.long <- gather(tmp, spl, expression, -gene)

# Tumor rows first, then normal rows, each tagged with its condition.
dat.24.long <- rbind(data.frame(dat.tumor24.long, condition = 'tumor'),
                     data.frame(dat.normal.long, condition = 'normal'))

# Only the median-based summaries are plotted in Fig 1a.
keep.med <- dat.24.long$gene %in% c('PR_med', 'PC_med')
dat.24.long.med <- dat.24.long[keep.med, ]

#### Fig 1 stat: mean and sd of expression per condition and summary row
mean.sd <- function(x) c(mean(x, na.rm = TRUE), sd(x, na.rm = TRUE))
aggregate(dat.24.long$expression,
          list(dat.24.long$condition, dat.24.long$gene),
          mean.sd)


########### Fig 1a:
par(mar = c(4, 4, 3, 1))

bp <- boxplot(expression ~ condition + gene, data = dat.24.long.med,
              ylim = c(-6, 7), col = 'white',
              main = 'Discovery Cohort',
              ylab = 'Normalized Expression', xlab = '',
              names = c('Normal', 'Tumor', 'Normal', 'Tumor'))

# Gene names under the two pairs of boxes, separated by 55 blanks.
sp <- strrep(' ', 55)
mtext(paste0('PCA3', sp, 'Prune2'), side = 1, line = 2)

# Dashed divider between the two genes.
segments(x0 = 2.5, y0 = -8, x1 = 2.5, y1 = 8, lty = 2)

#### Wilcoxon signed rank test for paired observations
# Row masks reused for both genes.
is.tumor <- dat.24.long$condition == 'tumor'
is.normal <- dat.24.long$condition == 'normal'
id.map <- dat.clinc[, c('Sample.ID.107', 'Sample.ID.24')]

## PC: attach the normal-sample id to each tumor row, then join the normal
## expression on that id (expression.x = tumor, expression.y = normal).
is.PC <- dat.24.long$gene == 'PC_med'
PC_med.tumor <- dat.24.long[is.tumor & is.PC, c('spl', 'expression')]
PC_med.tumor <- merge(PC_med.tumor, id.map,
                      by.x = 'spl', by.y = 'Sample.ID.107', sort = FALSE, all.x = TRUE)
PC_med.normal <- dat.24.long[is.normal & is.PC, c('spl', 'expression')]
PC_med <- merge(PC_med.tumor, PC_med.normal,
                by.x = 'Sample.ID.24', by.y = 'spl', sort = FALSE, all.x = TRUE)
#PC_med <- PC_med[!is.na(PC_med$expression.y),]

wilcox.test(PC_med$expression.x, PC_med$expression.y, paired = TRUE)

## PR: same pairing for the PR summary.
is.PR <- dat.24.long$gene == 'PR_med'
PR_med.tumor <- dat.24.long[is.tumor & is.PR, c('spl', 'expression')]
PR_med.tumor <- merge(PR_med.tumor, id.map,
                      by.x = 'spl', by.y = 'Sample.ID.107', sort = FALSE, all.x = TRUE)
PR_med.normal <- dat.24.long[is.normal & is.PR, c('spl', 'expression')]
PR_med <- merge(PR_med.tumor, PR_med.normal,
                by.x = 'Sample.ID.24', by.y = 'spl', sort = FALSE, all.x = TRUE)

wilcox.test(PR_med$expression.x, PR_med$expression.y, paired = TRUE)

# Draw a bracket from x.left to x.right at height y.top with end ticks down to
# y.tick, and a text label centred above it at height y.text.
draw.bracket <- function(x.left, x.right, y.top, y.tick, y.text, label) {
  segments(x0 = c(x.left, x.left, x.right), y0 = y.top,
           x1 = c(x.right, x.left, x.right), y1 = c(y.top, y.tick, y.tick))
  text(x = (x.left + x.right) / 2, y = y.text, labels = label, cex = 0.9)
}

draw.bracket(1, 2, y.top = 6, y.tick = 5.75, y.text = 6.5, label = 'p < 0.001')
draw.bracket(3, 4, y.top = 3, y.tick = 2.75, y.text = 3.5, label = 'p < 0.001')


########### Fig 1b:
# Long form of the tumor summaries: one row per (summary row, sample).
tmp <- data.frame(gene = rownames(dat.tumor), dat.tumor, stringsAsFactors = FALSE)
rownames(tmp) <- NULL

dat.tumor.long <- gather(tmp, spl, expression, -gene)

# Median-based PR value per sample, joined to the clinical table
# (Sample.ID.24 is not carried into the analysis table).
pr.med <- dat.tumor.long[dat.tumor.long$gene == 'PR_med', c('spl', 'expression')]
names(pr.med)[names(pr.med) == 'expression'] <- 'PR'
clinc.107 <- dat.clinc[, !names(dat.clinc) == 'Sample.ID.24']

dat.anlz <- merge(pr.med, clinc.107, by.x = 'spl', by.y = 'Sample.ID.107')

# Median-based PC value per sample, added in front of the PR column.
pc.med <- dat.tumor.long[dat.tumor.long$gene == 'PC_med', c('spl', 'expression')]
names(pc.med)[names(pc.med) == 'expression'] <- 'PC'

dat.anlz <- merge(pc.med, dat.anlz, by = 'spl')

# PR relative to PC for each sample.
dat.anlz$Ratio <- dat.anlz$PR / dat.anlz$PC

# Rows with RECUR_YesNo == 'LTF' are left out of every recurrence comparison
# (presumably "lost to follow-up" -- confirm). Subset once and reuse.
dat.recur <- dat.anlz[!dat.anlz$RECUR_YesNo == 'LTF', ]

# Wilcoxon rank-sum p-value for PC by recurrence status. Computed before
# plotting so the figure annotation always shows the value from the current
# data instead of a hand-copied constant.
p.PC.recur <- wilcox.test(PC ~ RECUR_YesNo, data = dat.recur)$p.value
p.PC.recur

# The unlabelled draft boxplot that used to precede this call was dropped: it
# was overwritten immediately on screen and produced a stray extra page on
# file devices.
# The x-axis label previously read 'Biomedical Recurrence'; corrected to
# match the 'Biochemical' wording used for the survival plots.
bp <- boxplot(PC ~ RECUR_YesNo, data = dat.recur, ylim = c(-3.5, 7.5),
              main = 'Discovery Cohort', ylab = 'PCA3 Normalized Expression',
              xlab = 'Biochemical Recurrence',
              names = c('No', 'Yes'), col = 'white')

# Bracket over the two boxes with the p-value (4 decimals) above it.
segments(x0 = 1, y0 = 6.75, x1 = 2, y1 = 6.75)
segments(x0 = 1, y0 = 6.75, x1 = 1, y1 = 6.5)
segments(x0 = 2, y0 = 6.75, x1 = 2, y1 = 6.5)
text(x = 1.5, y = 7, labels = sprintf('p = %.4f', p.PC.recur), cex = 0.9)

#### stat for Fig 1B
# Group means use every row of dat.anlz (including 'LTF'); the tests use the
# subset without 'LTF'.
aggregate(dat.anlz$PR, list(dat.anlz$RECUR_YesNo), mean)
wilcox.test(PR ~ RECUR_YesNo, data = dat.recur)$p.value

aggregate(dat.anlz$PC, list(dat.anlz$RECUR_YesNo), mean)
p.PC.recur

aggregate(dat.anlz$Ratio, list(dat.anlz$RECUR_YesNo), mean)
wilcox.test(Ratio ~ RECUR_YesNo, data = dat.recur)$p.value


########### Fig 2
# Dichotomise each marker at its own mean: 2 = above the mean, 1 = otherwise.
mean.PR <- mean(dat.anlz$PR)
dat.anlz$class.PR <- ifelse(dat.anlz$PR > mean.PR, 2, 1)

mean.PC <- mean(dat.anlz$PC)
dat.anlz$class.PC <- ifelse(dat.anlz$PC > mean.PC, 2, 1)

# Fixed: the ratio class must compare Ratio (not PR) with the mean ratio;
# the previous comparison classified samples by PR against the wrong cut-off.
mean.Ratio <- mean(dat.anlz$Ratio)
dat.anlz$class.Ratio <- ifelse(dat.anlz$Ratio > mean.Ratio, 2, 1)

# Text labels for the PR and PC classes, used as strata in the survival plots.
dat.anlz$PR_grp <- ''
dat.anlz$PR_grp[dat.anlz$class.PR == 1] <- 'Low'
dat.anlz$PR_grp[dat.anlz$class.PR == 2] <- 'High'

dat.anlz$PC_grp <- ''
dat.anlz$PC_grp[dat.anlz$class.PC == 1] <- 'Low'
dat.anlz$PC_grp[dat.anlz$class.PC == 2] <- 'High'

# Event indicator for Surv(): 1 when RECUR_YesNo is 'Y', 0 for every other
# value (so any non-'Y' row is treated as censored).
dat.anlz$status <- 0
dat.anlz$status[(dat.anlz$RECUR_YesNo == 'Y')] <- 1

## Kaplan-Meier curves by PC group (High / Low).
## Time is YearsDiff and the event indicator is status, both from dat.anlz.
sfit.PC <- survfit(Surv(YearsDiff, status) ~ PC_grp, data=dat.anlz)

# pval=TRUE asks survminer to print a p-value on the plot; risk.table=TRUE
# adds the number-at-risk table below the curves.
# NOTE(review): legend.labs and palette are applied in strata order; this
# presumably is High then Low (alphabetical) -- confirm the labels match.
p.PC <- ggsurvplot(sfit.PC, pval=TRUE, #conf.int=TRUE, 
                   legend.title="PCA3 expression", 
                   legend.labs=c('High', 'Low'),
                   risk.table=TRUE,
                   palette=c("red", "blue"), 
                   title="Discovery Cohort", 
                   risk.table.height=.2,
                   xlab='Years Post Radical Prostatectomy',
                   ylab='Biochemical PFS Probability')

# Evaluating the object at top level draws the plot.
p.PC


## Same analysis with the PR group as the stratifying variable.
sfit.PR <- survfit(Surv(YearsDiff, status) ~ PR_grp, data=dat.anlz)

p.PR <- ggsurvplot(sfit.PR, pval=TRUE, #conf.int=TRUE, 
                   legend.title="PRUNE2 expression", 
                   legend.labs=c('High', 'Low'),
                   risk.table=TRUE,
                   palette=c("red", "blue"), 
                   title="Discovery Cohort", 
                   #risk.table.height=.15,
                   xlab='Years Post Radical Prostatectomy',
                   ylab='Biochemical PFS Probability')

p.PR

## Same analysis stratified by the numeric ratio class (class.Ratio, 1 or 2).
## No legend.labs are given here, so the legend uses the default strata names.
## NOTE(review): with strata presumably ordered 1 then 2, red goes to class 1
## here, whereas red is labelled 'High' in the two plots above -- confirm the
## colour convention is intended.
sfit.Ratio <- survfit(Surv(YearsDiff, status) ~ class.Ratio, data=dat.anlz)

p.Ratio <- ggsurvplot(sfit.Ratio, pval=TRUE, #conf.int=TRUE, 
                      legend.title="Ratio", 
                      risk.table=TRUE,
                      palette=c("red", "blue"), 
                      title="Discovery Cohort", 
                      xlab='Years Post Radical Prostatectomy',
                      ylab='Biochemical PFS Probability')

p.Ratio

